ITM and elevation – preliminary study.

Descriptive stats

Average elevation across different languages

data <- read.csv('elevation_villages_correct.csv')
data %>% 
  group_by(language) %>% 
  slice(which.min(elevation)) %>% 
  select(eng_vil_name, elevation) %>% 
  write.csv('minimum_vil.csv')
## Adding missing grouping variables: `language`
ggplot(data=data, aes(x=fct_reorder(language, elevation, .fun=mean, .desc=TRUE), y=elevation))+
  theme_minimal()+
  # stat_summary(fun.data = mean_se,  
  #                geom = "errorbar", color='black', alpha=0.3)+
  geom_jitter(color='grey', alpha=0.1)+
  stat_summary(aes(color=Family), fun = "mean", geom = "point", size=2)+
  labs(y = "Elevation", x='Language', color='Family')+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  theme(legend.position = "top")+
  scale_color_viridis(discrete = TRUE, option = "D")+
  coord_flip()+
  theme(axis.text.x=element_text(angle=0, hjust=0.5),
        # text = element_text(family = "Andale Mono"),
        legend.position = 'bottom')+
  geom_hline(aes(yintercept=mean(elevation)), color='red', linetype='dotted')+
  guides(colour = guide_legend(nrow = 1))

ggsave('elev_lang.jpg', width = 8, height = 5)
data$log_c <- 0
data$log_c <- log(data$census_1926)

data$el_100 <- plyr::round_any(data$elevation, 1, f = ceiling)

data %>%
  group_by(el_100) %>%
  summarise(mean_c = mean(log_c)) %>%
  ggplot(aes(x=el_100, y=mean_c))+
    # geom_smooth()+
    geom_point()
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning: Removed 2 rows containing missing values (geom_point).

Average elevation across different languages (groupped by family)

ggplot(data=data, aes(x=fct_reorder(language, elevation, .fun=mean, .desc=TRUE), y=elevation))+
  theme_bw()+
  # geom_boxplot(aes(fill=Family))+
  geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 60, method = 'histodot')+
  facet_wrap(vars(Family), scales = "free_y", strip.position = 'top')+
  labs(y = "Elevation", x='Language', size=' 1926 Census\n data')+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))+
  theme(legend.position = "top")+
  scale_color_viridis(discrete = TRUE, option = "D")+
  coord_flip()+
  theme(legend.position = "none",
        legend.key = element_blank(), 
        strip.text = element_text(face="bold"), 
        axis.text.x=element_text(angle=0, hjust=0.5)) 

ggsave('elev_lang.jpg', width = 8, height = 5)
data[data$language == 'Godoberi ',] %>% 
  write.csv('godoberi.csv')

Average elevation across families (groupped by family)

ggplot(data, aes(y=elevation, x=Family))+
  theme_bw()+
  geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 41, method = 'histodot')+
  theme(legend.position = "none",
        legend.key = element_blank(), 
        strip.text = element_text(face="bold"), 
        axis.text.x=element_text(angle=0, hjust=0.5))+
  # stat_summary(fun = "mean", geom = "crossbar", size=0.5, alpha=0.1, color='grey')+
  # geom_hline(aes(yintercept=mean(elevation)), color='red', linetype='dotted')

ggsave('family_dist.jpg', width = 8, height = 5)

Generalized Linear Models

all <- read.csv('all.csv')
kable(head(mtcars))
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
all %>%
  group_by(year_of_birth, sex) %>%
  summarize(count=n()) %>%
  mutate(percentage = count / sum(count)) %>%
  ggplot(aes(x=year_of_birth, y=percentage, fill=sex))+
  geom_area()+
  theme_blank()
## `summarise()` regrouping output by 'year_of_birth' (override with `.groups` argument)

Average elevation across different number of L2’s

ggplot(data = all, aes(y=elevation, x=factor(number.of.lang.strat)))+
  theme_bw()+
  geom_violin(fill='grey', color='grey', trim = TRUE)

# events <- 0:4
# density <- dpois(x = events, lambda = mean(all$number.of.lang.strat))
# 
# ggplot()+
#   geom_point(aes(y=density, x=0:4, color='Poisson distribution'))+
#   geom_line(aes(y=density, x=0:4, color='Poisson distribution'))+
#   ylim(0, 1)+
#   geom_point(aes(x=co$x, y=co$freq/sum(co$freq), color='Actual data'))+
#   geom_line(aes(x=co$x, y=co$freq/sum(co$freq), color='Actual data'))+
#   labs(y = "Probability", x='ITM')

Poisson and linear regressions

fit_p <- glm(number.of.lang.strat ~ elevation + year_of_birth*sex + language.population*residence, family="poisson", data=all)
fit_lin <- glm(number.of.lang.strat ~ elevation + year_of_birth*sex + language.population*residence, data=all)

Linear:

ggplot(ggpredict(fit_lin, terms = c('elevation')), aes(x, predicted)) +
  geom_line()+
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1)+
  theme_bw()

  # facet_wrap(~facet)

Poisson:

ggplot(ggpredict(fit_p, terms = c('elevation')), aes(x, predicted)) +
  geom_line()+
  geom_ribbon(aes(ymin = conf.low, ymax = conf.high), alpha = .1)+
  theme_bw()

  # facet_wrap(~facet)
ITM <- read.csv('ITM.csv')
all <- read.csv('all.csv')
all_subset <- all[all$year_of_birth < 1930,]

all_subset$norm_yb <- scale(all_subset$year_of_birth)
all_subset$norm_el <- scale(all_subset$elevation)
all_subset$norm_itm <- scale(all_subset$number.of.lang.strat)
all_subset$norm_pop <- scale(all_subset$language.population)

all$norm_yb <- scale(all$year_of_birth)
all$norm_el <- scale(all$elevation)
all$norm_itm <- scale(all$number.of.lang.strat)
all$norm_pop <- scale(all$language.population)
all$norm_vil_pop <- scale(all$village.population)

pois_2 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=all_subset, family='poisson')
summary(pois_2)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
##    Data: all_subset
## 
##      AIC      BIC   logLik deviance df.resid 
##   4301.6   4323.8  -2146.8   4293.6     1921 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.4442 -0.4142 -0.0894  0.2961  3.6068 
## 
## Random effects:
##  Groups                  Name        Variance Std.Dev.
##  norm_yb:sex             (Intercept) 0.0134   0.1157  
##  residence:mother.tongue (Intercept) 0.5030   0.7092  
## Number of obs: 1925, groups:  norm_yb:sex, 139; residence:mother.tongue, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept)  -0.2453     0.1073  -2.286   0.0223 * 
## norm_el       0.3285     0.1068   3.076   0.0021 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## norm_el -0.017

Garik M. model:

garik_model <- glmer(number.of.lang.strat ~ sex + norm_pop + (1|norm_yb) + (1|residence), data=all, family='poisson')
summary(garik_model)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: number.of.lang.strat ~ sex + norm_pop + (1 | norm_yb) + (1 |  
##     residence)
##    Data: all
## 
##      AIC      BIC   logLik deviance df.resid 
##  12671.9  12705.8  -6330.9  12661.9     6473 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2651 -0.5178 -0.1423  0.3334  4.8590 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  norm_yb   (Intercept) 0.1061   0.3257  
##  residence (Intercept) 0.7807   0.8836  
## Number of obs: 6478, groups:  norm_yb, 158; residence, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -0.75257    0.13259  -5.676 1.38e-08 ***
## sexм         0.24847    0.02819   8.814  < 2e-16 ***
## norm_pop    -0.29958    0.11690  -2.563   0.0104 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##          (Intr) sexм  
## sexм     -0.124       
## norm_pop -0.080 -0.001

Normalized elevation vs. normalized village population.

ggplot(data=all, aes(x=norm_el, y=norm_vil_pop))+
  geom_point()

pois_2_pop <- glmer(number.of.lang ~ norm_el + (1|norm_yb) + (1|sex) + (1|residence) + norm_pop + norm_vil_pop, data=all, family='poisson')
summary(pois_2_pop)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang ~ norm_el + (1 | norm_yb) + (1 | sex) + (1 | residence) +  
##     norm_pop + norm_vil_pop
##    Data: all
## 
##      AIC      BIC   logLik deviance df.resid 
##  12659.3  12706.8  -6322.7  12645.3     6471 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2613 -0.5227 -0.1316  0.3337  4.8098 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  norm_yb   (Intercept) 0.10649  0.3263  
##  residence (Intercept) 0.47514  0.6893  
##  sex       (Intercept) 0.02003  0.1415  
## Number of obs: 6478, groups:  norm_yb, 158; residence, 50; sex, 2
## 
## Fixed effects:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.67170    0.14619  -4.595 4.33e-06 ***
## norm_el       0.28415    0.10318   2.754  0.00589 ** 
## norm_pop     -0.30435    0.09433  -3.227  0.00125 ** 
## norm_vil_pop -0.50435    0.11194  -4.506 6.62e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) norm_l nrm_pp
## norm_el     -0.034              
## norm_pop    -0.041  0.138       
## norm_vil_pp  0.094  0.095  0.125
ITM$norm_yb <- scale(ITM$year_of_birth)
ITM$norm_el <- scale(ITM$elevation)
ITM$norm_itm <- scale(ITM$number.of.lang.strat)
pois_1 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=ITM, family='poisson')
summary(pois_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
##    Data: ITM
## 
##      AIC      BIC   logLik deviance df.resid 
##   8494.0   8519.5  -4243.0   8486.0     4323 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2029 -0.4526 -0.1267  0.2698  3.8511 
## 
## Random effects:
##  Groups                  Name        Variance Std.Dev.
##  norm_yb:sex             (Intercept) 0.04944  0.2223  
##  residence:mother.tongue (Intercept) 0.74412  0.8626  
## Number of obs: 4327, groups:  norm_yb:sex, 118; residence:mother.tongue, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.6311     0.1281  -4.928 8.33e-07 ***
## norm_el       0.3873     0.1281   3.023  0.00251 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## norm_el -0.059

Poisson regression vs. real data

pois_1 <- glmer(number.of.lang.strat ~ norm_el + (1|norm_yb:sex) + (1|residence:mother.tongue), data=ITM, family='poisson')
summary(pois_1)
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: poisson  ( log )
## Formula: 
## number.of.lang.strat ~ norm_el + (1 | norm_yb:sex) + (1 | residence:mother.tongue)
##    Data: ITM
## 
##      AIC      BIC   logLik deviance df.resid 
##   8494.0   8519.5  -4243.0   8486.0     4323 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.2029 -0.4526 -0.1267  0.2698  3.8511 
## 
## Random effects:
##  Groups                  Name        Variance Std.Dev.
##  norm_yb:sex             (Intercept) 0.04944  0.2223  
##  residence:mother.tongue (Intercept) 0.74412  0.8626  
## Number of obs: 4327, groups:  norm_yb:sex, 118; residence:mother.tongue, 50
## 
## Fixed effects:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.6311     0.1281  -4.928 8.33e-07 ***
## norm_el       0.3873     0.1281   3.023  0.00251 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##         (Intr)
## norm_el -0.059
AIC(pois_1)
## [1] 8493.972
ITM_el <- dplyr::select(ITM, elevation, number.of.lang.strat) %>% 
  group_by(elevation) %>% 
  summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` ungrouping output (override with `.groups` argument)
kable(head(ITM_el))
elevation mean_itm count
355.0269 0.0000000 94
365.5033 1.0689655 145
407.0000 0.0973451 113
450.7447 0.5662651 83
491.3748 0.8709677 31
667.3964 0.0000000 66
pred <- ggpredict(pois_1, terms = c('norm_el'))
true_sequence <- seq(min(ITM$elevation), max(ITM$elevation), length.out=21)
ggplot()+
  geom_point(data=ITM_el, aes(x=elevation, y=mean_itm, size=count), color='blue', alpha=.5)+
  geom_line(data=pred, aes(x=true_sequence, y=predicted), size=0.8)+
  geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=true_sequence), alpha = .15)+
  theme_bw()+
  labs(x = "Elevation", y='Average ITM', size=' Number\n of observations')

ggsave('poison_observ.jpg', width = 7, height = 4)

Some geospatial ideas

ITM

itm_coors <- read.csv('itm_coords_no_Hinuq.csv')
itm_coors <- itm_coors[complete.cases(itm_coors), ]
kable(head(itm_coors))
X index Unnamed..0 expedition name number.of.na russian.na sex type year_of_birth year.of.death аварский агульский азербайджанский азербайджанский.или.кумыкский акушинский.даргинский андийский арчинский ахвахский багвалинский бежтинский ботлихский гапшиминский.даргинский гдымско.фийский.лезгинский гинухский грузинский даргинский кадарский.даргинский кайтагский.даргинский каратинский кубачинский.даргинский кумыкский лакский лезгинский мегебский муиринский.даргинский русский рутульский сирхинский.даргинский табасаранский тукитинский хновский.рутульский цахурский цезский цудахарский.даргинский чеченский чирагский.даргинский mother.tongue village.population language.population number.of.lang mother.tongue.match number.of.lang.strat elevation Lat Lon
0 Balkhar 0 Balkhar, Tsulikana, Shukty, Kuli Абедет 0 0 ж 0 1922 2000 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
1 Balkhar 1 Balkhar, Tsulikana, Shukty, Kuli Абдурахман 0 0 м 0 1923 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
2 Balkhar 2 Balkhar, Tsulikana, Shukty, Kuli Магомед 0 0 м 0 1923 1995 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
3 Balkhar 3 Balkhar, Tsulikana, Shukty, Kuli Жума 0 0 ж 0 1924 1984 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
4 Balkhar 4 Balkhar, Tsulikana, Shukty, Kuli Шаммадаев Калла 0 0 м 0 1924 2004 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 2 1 2 1658.484 42.2797 47.2314
5 Balkhar 5 Balkhar, Tsulikana, Shukty, Kuli Гусейн 0 0 м 0 1925 2010 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 лакский 1195 31987 0 1 0 1658.484 42.2797 47.2314
itm_vill <- itm_coors %>% 
  group_by(index, Lat, Lon, elevation) %>% 
  summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` regrouping output by 'index', 'Lat', 'Lon' (override with `.groups` argument)
kable(head(itm_vill))
index Lat Lon elevation mean_itm count
Balkhar 42.27970 47.23140 1658.4839 0.3296703 91
Bezhta 42.13390 46.12470 1633.0813 1.3636364 143
Burkikhan 41.80636 47.53847 1917.4192 0.7826087 92
Chabanmakhi 42.63784 47.25923 1020.0406 0.6400000 75
Chankurbe 42.63047 47.30022 882.7377 0.5652174 92
Chirag 41.83810 47.43030 2223.1909 0.9718310 71
matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix < 19] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)

itm_vill$knn_dist <- knn(net)$knn

itm_vill$degree <- degree(net)

ggplot(itm_vill, aes(x=factor(degree), y=mean_itm, label=index))+
  theme_bw()+
  geom_boxplot()

itm_vill %>% 
  group_by(degree)  %>% 
  summarise(mean = mean(mean_itm), sd = sd(mean_itm),
            n = n())  %>% 
  mutate(se = sd / sqrt(n),
         lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
         upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se)  %>% 
  ggplot(aes(x=factor(degree), y=mean))+
    geom_point()+
    geom_errorbar(aes(ymin=lower.ci, ymax=upper.ci))
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning in qt(1 - (0.05/2), n - 1): NaNs produced

## Warning in qt(1 - (0.05/2), n - 1): NaNs produced

itm_vill %>% 
  group_by(degree)  %>% 
  summarise(mean = mean(mean_itm), sd = sd(mean_itm),
            n = n())  %>% 
  mutate(se = sd / sqrt(n),
         lower.ci = mean - qt(1 - (0.05 / 2), n - 1) * se,
         upper.ci = mean + qt(1 - (0.05 / 2), n - 1) * se)  %>% 
  ggplot(aes(x=factor(degree), y=mean))+
    geom_point()+
    geom_errorbar(aes(ymin=lower.ci, ymax=upper.ci))
## `summarise()` ungrouping output (override with `.groups` argument)
## Warning in qt(1 - (0.05/2), n - 1): NaNs produced

## Warning in qt(1 - (0.05/2), n - 1): NaNs produced

matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 12] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)

net %>%  
  ggraph(layout = 'kk')+
  geom_edge_link(color = "orange")+
  theme_void()+
  geom_node_point(aes(size=itm_vill$mean_itm, color=itm_vill$elevation)) +  
  geom_node_text(size = 4,
                 aes(label = name), repel=TRUE)+
  labs(size='Mean ITM', color='Elevation')

ggsave('village_graph.jpg', width = 7, height = 4)
matrix <- distm(itm_vill[,c('Lon','Lat')], itm_vill[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 12] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill$index
colnames(matrix) <- itm_vill$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)

ggraph(graph=net, x=itm_vill$Lon, y=itm_vill$Lat)+
  theme_void()+
  geom_edge_link(color = "orange")+
  geom_node_point(aes(color=itm_vill$mean_itm)) +  
  geom_node_text(size = 4,
                 aes(label = name), repel=TRUE)+
  labs(size='Mean ITM', color='Mean ITM')+
  annotation_map(map_data("world"), fill = NA, colour = "grey50")

ggsave('village_graph_map.jpg', width = 7, height = 4)

Families

data_short <- data
matrix_fam <- distm(data_short[,c('Lon','Lat')], data_short[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix_fam[matrix_fam > 6] <- 0
rownames(matrix_fam) <- data_short$eng_vil_name
colnames(matrix_fam) <- data_short$eng_vil_name
net_fam <- graph.adjacency(matrix_fam, mode = "undirected", weighted = TRUE, diag = FALSE)

net_fam %>%
  # ggraph(layout = 'kk')+
  ggraph(x=data_short$Lon, y=data_short$Lat)+
  geom_edge_link(color = "orange")+
  theme_void()+
  geom_node_point(aes(color=data_short$Family), size=0.5)+
  labs(color='Family')

  # geom_node_text(aes(label = name), repel=TRUE)+
  # ggsave('village_graph_map.pdf', width = 30, height = 30)
data$neighbours_6 <- degree(net_fam)
data$log_c <- log(data$census_1926)

data$elevation_round <- plyr::round_any(data$elevation, 100, f = ceiling)

data %>%
  group_by(elevation_round) %>%
  summarise(neighbours = mean(neighbours_6), n_6 = neighbours_6, size=n(), pop = census_1926) %>%
  ggplot(aes(x=elevation_round, y=neighbours))+
    geom_line()+
    geom_point(aes(y=n_6, size=pop), alpha=0.1, color='blue')+
    theme_bw()
## `summarise()` regrouping output by 'elevation_round' (override with `.groups` argument)
## Warning: Removed 2 rows containing missing values (geom_point).

by_lang <- data  %>%  
  group_by(language, Family)  %>%  
  summarise(Lon = mean(Lon), Lat = mean(Lat), n = n())
## `summarise()` regrouping output by 'language' (override with `.groups` argument)
m_l <- distm(by_lang[,c('Lon','Lat')], by_lang[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
m_l[m_l > 25] <- 0
rownames(m_l) <- by_lang$language
colnames(m_l) <- by_lang$language
net_l <- graph.adjacency(m_l, mode = "undirected", weighted = TRUE, diag = FALSE)

net_l %>%
  ggraph(layout = 'kk')+
  geom_edge_link(color = "orange")+
  theme_void()+
  geom_node_point(aes(color=by_lang$Family))+
  geom_node_text(size = 2,
                aes(label = name), repel=TRUE)+
  labs(color='Family')

How many members of different families are around each village:

data_short <- data
matrix_fam <- distm(data_short[,c('Lon','Lat')], data_short[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix_fam[matrix_fam > 100] <- 0
net_fam <- graph.adjacency(matrix_fam, mode = "undirected", weighted = TRUE, diag = FALSE)

V(net_fam)$label <- data$Family

SameLabel <-  function(e) {
    V(net_fam)[ends(net_fam, e)[1]]$label == V(net_fam)[ends(net_fam, e)[2]]$label }
g2 <- delete_edges(net_fam, which(sapply(E(net_fam), SameLabel)))

data_short$degree <- degree(g2)

ggplot(data=data_short, aes(x=Family, y=degree))+
  geom_boxplot()

  # geom_dotplot(aes(fill=Family, color=Family), binaxis = "y", stackdir = "center", position = "dodge", binwidth = 0.45)

# ggsave('degree_family.jpg', width = 7, height = 4)
data_short  %>% 
  group_by(Family)  %>% 
  summarise(mean = mean(degree))
## `summarise()` ungrouping output (override with `.groups` argument)
## Registered S3 method overwritten by 'cli':
##   method     from    
##   print.boxx spatstat
## # A tibble: 7 x 2
##   Family  mean
##   <fct>  <dbl>
## 1 Andic   487.
## 2 Avar    409.
## 3 Dargwa  586.
## 4 Lak     839.
## 5 Lezgic  340.
## 6 Tsezic  405.
## 7 Turkic  589.
data_itm_fam <- read.csv('itm_coords_Family.csv')
itm_vill_fam <- data_itm_fam %>% 
  group_by(index, Lat, Lon, elevation, Family, mother.tongue) %>% 
  summarise(mean_itm = mean(number.of.lang.strat), count=n())
## `summarise()` regrouping output by 'index', 'Lat', 'Lon', 'elevation', 'Family' (override with `.groups` argument)
itm_vill_fam <- itm_vill_fam[!itm_vill_fam$index == 'Hinuq',]
itm_vill_fam[itm_vill_fam$mother.tongue == 'кадарский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'сирхинский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'цудахарский даргинский',]$Family <- 'Dargwa'
itm_vill_fam[itm_vill_fam$mother.tongue == 'гдымско-фийский лезгинский',]$Family <- 'Lezgic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'цахурский',]$Family <- 'Lezgic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'азербайджанский',]$Family <- 'Turkic'
itm_vill_fam[itm_vill_fam$mother.tongue == 'ахвахский',]$Family <- 'Andic'
itm_vill_fam$Lat <- itm_vill$Lat
itm_vill_fam$Lon <- itm_vill$Lon
matrix <- distm(itm_vill_fam[,c('Lon','Lat')], itm_vill_fam[,c('Lon','Lat')], fun=distVincentyEllipsoid) / 1000
matrix[matrix > 25] <- 0
mode(matrix) <- "numeric"
rownames(matrix) <- itm_vill_fam$index
colnames(matrix) <- itm_vill_fam$index
net <- graph.adjacency(matrix, mode = "undirected", weighted = TRUE, diag = FALSE)
# V(net)$label <- itm_vill_fam$Family
V(net)$label <- itm_vill_fam$mother.tongue
SameLabel <-  function(e) {
    V(net)[ends(net, e)[1]]$label == V(net)[ends(net, e)[2]]$label }
g2 <- delete_edges(net, which(sapply(E(net), SameLabel)))

itm_vill_fam$deg <- degree(g2)

# DiffLabel <-  function(e) {
#     V(net)[ends(net, e)[1]]$label != V(net)[ends(net, e)[2]]$label }
# g3 <- delete_edges(net, which(sapply(E(net), DiffLabel)))
# 
itm_vill_fam$deg <- degree(g2)
# itm_vill_fam$deg_tot <- degree(g3)
# itm_vill_fam$deg <- log(itm_vill_fam$deg_n/itm_vill_fam$deg_tot)

g2 %>%  
  ggraph(layout = 'kk')+
  geom_edge_link(color = "orange", alpha=0.5)+
  theme_void()+
  geom_node_point(aes(color=itm_vill_fam$Family))+
  scale_color_brewer(palette="Dark2")+
  labs(color='Family')

# ggsave('village_graph.jpg', width = 7, height = 4)
ggplot(itm_vill_fam, aes(y=mean_itm, x=deg))+
  geom_point()

ggplot(itm_vill_fam, aes(x=Family, y=deg))+
  geom_boxplot()+
  labs(x='Degree')

itm_deg_m <- lmer(mean_itm ~ deg + (1|Family:mother.tongue) + count, itm_vill_fam)
summary(itm_deg_m)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mean_itm ~ deg + (1 | Family:mother.tongue) + count
##    Data: itm_vill_fam
## 
## REML criterion at convergence: 54.5
## 
## Scaled residuals: 
##      Min       1Q   Median       3Q      Max 
## -1.25696 -0.55455  0.05043  0.40487  2.47483 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Family:mother.tongue (Intercept) 0.12227  0.3497  
##  Residual                         0.05217  0.2284  
## Number of obs: 49, groups:  Family:mother.tongue, 28
## 
## Fixed effects:
##              Estimate Std. Error        df t value Pr(>|t|)   
## (Intercept)  0.519597   0.175582 45.905846   2.959  0.00486 **
## deg          0.019170   0.023068 45.390793   0.831  0.41032   
## count        0.001549   0.001098 35.404594   1.412  0.16682   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##       (Intr) deg   
## deg   -0.676       
## count -0.679  0.128
pred <- ggpredict(itm_deg_m, terms = c('deg'))

ggplot()+
  geom_point(data=itm_vill_fam, aes(y=mean_itm, x=deg, color=Family))+
  geom_text_repel(data=itm_vill_fam, aes(label=index, y=mean_itm, x=deg))+
  geom_line(data=pred, aes(x=pred$x, y=predicted), size=0.8)+
  geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=pred$x), alpha = .15)+
  scale_color_brewer(palette="Dark2")

Data from bigger dataset:

overlap <- data_short[data_short$eng_vil_name %in% itm_vill_fam$index,]$eng_vil_name
ov_itm_vill_fam <- itm_vill_fam[itm_vill_fam$index %in% overlap,]
data_all <- data_short[data_short$eng_vil_name %in% itm_vill_fam$index,]
itm_vill_fam_m <- merge(x= ov_itm_vill_fam, y = data_all[, c('eng_vil_name', 'degree')], by.x='index', by.y = 'eng_vil_name', all=TRUE, copy=TRUE)

itm_deg <- lmer(mean_itm ~ degree + (1|Family:mother.tongue) + count, itm_vill_fam_m)
summary(itm_deg)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: mean_itm ~ degree + (1 | Family:mother.tongue) + count
##    Data: itm_vill_fam_m
## 
## REML criterion at convergence: 60.5
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7488 -0.5158  0.2256  0.5314  1.9616 
## 
## Random effects:
##  Groups               Name        Variance Std.Dev.
##  Family:mother.tongue (Intercept) 0.06150  0.2480  
##  Residual                         0.09047  0.3008  
## Number of obs: 41, groups:  Family:mother.tongue, 25
## 
## Fixed effects:
##               Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)  1.0701453  0.2604493 28.9692203   4.109 0.000298 ***
## degree      -0.0009389  0.0004361 23.1926888  -2.153 0.041951 *  
## count        0.0016051  0.0012567 35.5238115   1.277 0.209824    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##        (Intr) degree
## degree -0.841       
## count  -0.442 -0.026
pred <- ggpredict(itm_deg, terms = c('degree'))

ggplot()+
  geom_point(data=itm_vill_fam_m, aes(y=mean_itm, x=degree, color=Family))+
  geom_text_repel(data=itm_vill_fam_m, aes(label=index, y=mean_itm, x=degree))+
  geom_line(data=pred, aes(x=pred$x, y=predicted), size=0.8)+
  geom_ribbon(aes(ymin = pred$conf.low, ymax = pred$conf.high, x=pred$x), alpha = .15)+
  scale_color_brewer(palette="Dark2")